aligning_song_types.utf8

Clean session and load packages

#clean session
rm(list = ls())

# unload all non-based packages
out <- sapply(paste('package:', names(sessionInfo()$otherPkgs), sep = ""), function(x) try(detach(x, unload = FALSE, character.only = TRUE), silent = TRUE))

## vector with package names
# merge conflict resolved to HEAD: "tidyr" is kept because separate() is
# used further down when rewriting the prank nexus files
x <- c("pbapply", "parallel", "ggplot2", "ape", "seqinr", "ips", "gtools", "seqmagick", "ggmsa", "cowplot", "tidyr"
        )

# install (when missing) and load every package in x;
# aa collects TRUE/FALSE load results, one per package
aa <- lapply(x, function(y) {

  # check if installed, if not then install
  if (!y %in% installed.packages()[, "Package"])
    install.packages(y)

  # load package (TRUE spelled out instead of the reassignable T)
  try(require(y, character.only = TRUE), silent = TRUE)
})

Functions and parameters

#functions and parameters
# knit from the project root (one level above the Rmd's own folder)
knitr::opts_knit$set(root.dir = normalizePath(".."))

# low dpi / wide figures as the default for all chunks
knitr::opts_chunk$set(dpi = 50, fig.width = 12) 

# ggplot2 theme
theme_set(theme_classic(base_size = 20))

# Split a song sequence string into runs of consecutive identical segments.
# x: a single character string, assumed to be exactly 20 characters long
#    (positions 1:20 and the sentinel 21 are hard-coded below).
# Returns a list of strings, one per run (e.g. "aabb..." -> "aa", "bb", ...).
split_seqs <- function(x) {
 
  # flag a 1 at every position where the character differs from its neighbor;
  # from start to end
  # NOTE(review): position 1 looks AHEAD (compares chars 1 and 2) while all
  # later positions look BACK one character — together with the w1[1:2]
  # cleanup below this implements run-boundary detection, but the asymmetry
  # is worth confirming against the intended segmentation
  y <-  sapply(1:20, function(y) 
     if (y == 1)
    ifelse(substr(x, start = y + 1, stop = y + 1) == substr(x, start = y, stop = y), 0 , 1) else
          ifelse(substr(x, start = y - 1, stop = y - 1) == substr(x, start = y, stop = y), 0 , 1)
    )

  # individual characters of the sequence
  x2 <- strsplit(x, "")[[1]]
  
  # breakpoint positions (starts of new runs)
  w1 <- which(y == 1)
  
  # to add single elements at the end
    w1 <- c(w1, 21)
  # drop a duplicated breakpoint when both of the first two positions flagged
  if (w1[1] == 1 & w1[2] == 2) w1 <- w1[2:length(w1)]
    
  # paste together the characters belonging to each run; the first run goes
  # from the start of the string, later runs from the previous breakpoint
  sgmts <- lapply(1:length(w1), function(w){
    if (w1[w] == w1[1]) paste(x2[1:(w1[w] - 1)], collapse = "")  else
  paste(x2[w1[w - 1]:(w1[w] - 1)], collapse = "")  
  })

  return(sgmts)
}

# Split the lines of a multi-lek fasta (x: character vector, alternating
# ">name" header and sequence lines) into one fasta file per lek, written to
# the working directory. Returns the base names of the files created.
write_lek_fas <- function(x){
  
 # lek code = part before the first "_" in each header line
 # (headers sit at the odd positions of x)
 lk <- unlist(strsplit(x[seq(1, length(x), by = 2)], split = "_"))[seq(1, length(x), by = 2)]
  
 # strip the fasta ">" and re-append "_" so grep matches "LEK_" exactly
 lk <- gsub(">", "", paste0(lk, "_"))
 
 out <- sapply(unique(lk), USE.NAMES = FALSE, function(y){
   
   # all lines belonging to this lek
   wch.lk <- grep(pattern = y, x = x)
   
   # NOTE(review): the lek name is passed through tempfile()'s fileext, so the
   # random temp stem ends up in the middle of the file name — presumably just
   # to get a unique path; confirm this naming is intended
   fl.nms <-tempfile(tmpdir = getwd(), fileext =  paste0("song.seq_",y, "SongsSeq.fa"))
   
   # +1 so the sequence line following the last matched header is included
   writeLines(x[min(wch.lk):(max(wch.lk)+ 1)], con = fl.nms)
   
   return(basename(fl.nms))
 })

 return(out)  
}

# get leks with more than 5 song types
leks <- c("BR1", "CCE", "CCL", "HC1", "HC2", "LOC", "SAT", "SJA", "SUR", "TR1", "TR2")

Selecting leks for analysis

dat <- read.csv("./data/raw/segments_by_song_type.csv", stringsAsFactors = FALSE)

yrs <- read.csv("./data/raw/year_range_by_song_type.csv", stringsAsFactors = FALSE)

#song types per lek
stl <- tapply(dat$song.type, dat$lek, length)

#years sampled per lek
ypl <- tapply(yrs$year, yrs$lek, function(x) length(unique(x)))

# year range per lek
yrpl <- tapply(yrs$year, yrs$lek, function(x) max(x) - min(x))

# number of year gaps per lek (range minus years sampled, plus 1)
gpl <- yrpl - ypl + 1

# fixed jitter so overlapping lek labels stay readable in the plots
set.seed(1011)
rnd_var <- rnorm(n = length(stl), mean = 2, sd = 1)

# highlight the leks selected for the downstream analysis
cols <- rep("white", length(gpl))
cols[which(names(gpl) %in%  c("SUR", "CCE", "HC1", "BR1", "TR1"))] <- viridis::viridis(10, alpha = 0.5)[3]

plot(stl, ypl + rnd_var, col = cols, xlab = "# of song types", ylab = "# of years sampled", pch = 20, cex = 9)
text(stl, ypl + rnd_var, labels = names(stl), cex = 2)

plot(stl, yrpl + rnd_var, col = cols, cex = 9, pch = 20, xlab = "# of song types", ylab = "year range")
text(stl, yrpl + rnd_var, labels = names(stl), cex = 2)

plot(gpl, yrpl + rnd_var, col = cols, cex = 9, pch = 20, xlab = "# of year gaps", ylab = "year range")
text(gpl, yrpl + rnd_var, labels = names(stl), cex = 2)

#selected leks
sel_leks <- c("SUR", "CCE", "HC1", "BR1", "TR1")

Create fastas for mafft

# create fastas for each lek

###----Alignment of phenotypic sequence data: CSV to fasta format----------------------

dat <- read.csv("./data/raw/segments_by_song_type.csv", stringsAsFactors = FALSE)
yrs <- read.csv("./data/raw/year_range_by_song_type.csv", stringsAsFactors = FALSE)

# Are id's unique?
length(dat$song.type) == length(unique(dat$song.type)) #OK

# seq as a single column (columns 3:22 hold the 20 song segments)
dat$seqs <- sapply(1:nrow(dat), function(i)  paste(dat[i, 3:22], collapse = "")
)

# one row for each song type-year combination
dat.yr <- yrs

dat.yr$song.type.year <- paste(dat.yr$song.type, dat.yr$year, sep = "-")

# merge conflict resolved to HEAD: keep only the leks selected for analysis
dat.yr <- dat.yr[dat.yr$lek %in% sel_leks, ]


dat.yr <- merge(dat.yr, dat[, c("song.type", "seqs")], by = "song.type")


# sort so the earliest year of each song type comes first
dat.all.old <- dat.yr[order(dat.yr$song.type, dat.yr$year), ]

# only 1 year (the earliest) per song type
dat.early.old <- dat.all.old[!duplicated(dat.all.old$song.type), ]

# same subsets excluding early years
# merge conflict resolved to HEAD: cutoff year 2000 for both subsets,
# consistent with the prank section below (the other side used 2007/2008)
dat.all.new <- dat.all.old[dat.all.old$year > 2000, ]

# only 1 year per song type, excluding years before the cutoff
dat.early.new <- dat.early.old[dat.early.old$year > 2000, ]

# rest of song type years (without earliest)
dat.rest.old <- dat.all.old[!dat.all.old$song.type.year %in% dat.early.old$song.type.year,]

dat.rest.new <- dat.all.new[!dat.all.new$song.type.year %in% dat.early.new$song.type.year,]


# by lek all song types: write one fasta per lek and data subset
# merge conflict resolved to HEAD: the "old" files are written only for
# leks that actually have data before 2000
for(i in sel_leks){

  # all old
  if (any(dat.yr$year[dat.yr$lek == i] < 2000))
    write.fasta(sequences = as.list(dat.rest.old$seqs[dat.rest.old$lek == i]), names = dat.rest.old$song.type.year[dat.rest.old$lek == i], file.out = paste0("./data/processed/song_sequences/song_seqs_rest_old_", i, ".fa"), open = "w", nbchar = 60, as.string = FALSE)

  # early old
  if (any(dat.yr$year[dat.yr$lek == i] < 2000))
    write.fasta(sequences = as.list(dat.early.old$seqs[dat.early.old$lek == i]), names = dat.early.old$song.type.year[dat.early.old$lek == i], file.out = paste0("./data/processed/song_sequences/song_seqs_early_old_", i, ".fa"), open = "w", nbchar = 60, as.string = FALSE)

  # all new
  write.fasta(sequences = as.list(dat.rest.new$seqs[dat.rest.new$lek == i]), names = dat.rest.new$song.type.year[dat.rest.new$lek == i], file.out = paste0("./data/processed/song_sequences/song_seqs_rest_new_", i, ".fa"), open = "w", nbchar = 60, as.string = FALSE)

  # early new
  write.fasta(sequences = as.list(dat.early.new$seqs[dat.early.new$lek == i]), names = dat.early.new$song.type.year[dat.early.new$lek == i], file.out = paste0("./data/processed/song_sequences/song_seqs_early_new_", i, ".fa"), open = "w", nbchar = 60, as.string = FALSE)
  }

Aligning with mafft

# alignments real songs

# add using or not song types before 2008
use.old.fossils <- c("old", "new")

# type of costs 
type <- c("optimal", "all.equal")

grd.costs <- expand.grid(use.old.fossils = use.old.fossils, type = type)

# the six segment letters used in the fastas
unq.sgmts <- c("d", "f", "m", "p", "s", "u")

# ascii hex codes used in mafft's --textmatrix file
# NOTE(review): the i-th hex code does NOT correspond to the i-th letter of
# unq.sgmts (0x73='s', 0x75='u', 0x70='p', 0x6D='m', 0x66='f', 0x64='d' — the
# reverse, except the first two are swapped relative to rev(unq.sgmts)).
# Since the gsub loop below pairs them positionally, confirm this ordering is
# intentional; otherwise the emitted matrix assigns costs to the wrong pairs.
asc.val <- c("0x73", "0x75", "0x70", "0x6D", "0x66", "0x64")


# rows 1-2: optimal costs, rows 3-4: (nearly) equal costs
cost <- matrix(c(0.5, 1.25, 2, 0.5, 1.25, 2, # optimal
               rep(c(1, 1, 1.5), 2)), # equal costs
               nrow = 4, byrow = TRUE)

cost.mat <- data.frame(cost, stringsAsFactors = FALSE)

names(cost.mat)[1:3] <- c("different.category", "same.category", "same.segment")

cost.mat <- data.frame(cost.mat, grd.costs, stringsAsFactors = FALSE)


# grid cost values: every ordered pair of segment letters
grd.sgm <- expand.grid(sgmt1 = unq.sgmts, sgmt2 = unq.sgmts)

grd.sgm$type <- "different.category"

# classify each pair: both trills / both tones -> same.category,
# identical letters -> same.segment
trlls <- c("f", "m", "s")
tones <- c("d", "p", "u")
grd.sgm$type[grd.sgm$sgmt1 %in% trlls & grd.sgm$sgmt2 %in% trlls] <-  "same.category"
grd.sgm$type[grd.sgm$sgmt1 %in% tones & grd.sgm$sgmt2 %in% tones] <- "same.category"
grd.sgm$type[grd.sgm$sgmt1 == grd.sgm$sgmt2] <- "same.segment"

# create combs: one pairwise-cost table per row of cost.mat
cost.grds <- pblapply(1:nrow(cost.mat), cl = 3, function(x){
  
  grd.sgm$cost <- NA
  grd.sgm$cost[grd.sgm$type == "different.category"] <- cost.mat$different.category[x]
  grd.sgm$cost[grd.sgm$type == "same.category"] <- cost.mat$same.category[x]
  grd.sgm$cost[grd.sgm$type == "same.segment"] <- cost.mat$same.segment[x]

  grd.sgm$pair <- paste(grd.sgm$sgmt1, "x", grd.sgm$sgmt2)

  # translate the letters to their ascii hex codes for mafft
  grd.sgm$asc1 <- grd.sgm$sgmt1
  grd.sgm$asc2 <- grd.sgm$sgmt2
  
  for(i in 1:length(unq.sgmts)){
    grd.sgm$asc1 <- gsub(unq.sgmts[i], asc.val[i], grd.sgm$asc1)
    grd.sgm$asc2 <- gsub(unq.sgmts[i], asc.val[i], grd.sgm$asc2)
  }

  grd.sgm$cost.type <- cost.mat$type[x]
  # NOTE(review): cost.mat has no "cost.vals" column, so this assigns NULL
  # (a no-op) — confirm which column was intended
  grd.sgm$cost.vals <- cost.mat$cost.vals[x]
  
  grd.sgm$use.old.fossils <- cost.mat$use.old.fossils[x]
  
  return(grd.sgm)
})

length(cost.grds)

# get leks with more than 5 song types
# leks <- names(table(sgmnts$lek))[table(sgmnts$lek)> 5]


pboptions(type = "timer")

yrs <- read.csv("./data/raw/year_range_by_song_type.csv", stringsAsFactors = FALSE)

# exclude all  equal
# cost.grds <- cost.grds[1]

# run mafft for every cost-matrix / fossil-use combination and collect
# gap counts and convergence flags per lek
out <- pblapply(1:length(cost.grds), cl = parallel::detectCores() - 1, function(y){
  
  Y <- cost.grds[[y]]

  # write the cost matrix in mafft's text-matrix format: "hex1 hex2 cost"
  w <- apply(Y[, c("asc1", "asc2", "cost")], 1, paste,  collapse = " ")
  
  writeLines(w, con = paste0("./data/processed/mafft_alignments/",Y$cost.type[1],"_cost_matrix.txt"))
  
  # merge conflict resolved to HEAD: iterate over the selected leks only
  out <- lapply(sel_leks, function(i){
    
    # don't do those with no data before 2008
    if (min(yrs$year[yrs$lek == i]) > 2000 & Y$use.old.fossils[1] == "old")
      return(NULL) else
      {      
          cll <- paste0("mafft --localpair --maxiterate 50000 --textmatrix ./data/processed/mafft_alignments/",Y$cost.type[1] ,"_cost_matrix.txt ./data/processed/song_sequences/song_seqs_early_", Y$use.old.fossils[1], "_", i,".fa > ./data/processed/mafft_alignments/", i, "_", Y$cost.type[1], "_early_",  Y$use.old.fossils[1],  "_alignment.fa 2> ./data/processed/mafft_alignments/early_", Y$use.old.fossils[1], "_", i,"_", Y$cost.type[1], "_mafft_output.txt")
  
      system(cll)
    
    # gaps introduced = aligned length minus the 20 original segments
    alg <- readLines(paste0("./data/processed/mafft_alignments/",i,"_", Y$cost.type[1], "_early_",  Y$use.old.fossils[1], "_alignment.fa"))[2]
    gaps <- nchar(alg) - 20
    
    # did mafft report convergence in its stderr log?
    convg <- ifelse(any(grepl("Converged", readLines(paste0("./data/processed/mafft_alignments/early_", Y$use.old.fossils[1], "_", i ,"_", Y$cost.type[1],"_mafft_output.txt")))), "Y", "N")
    
    df2 <- data.frame(lek = i, gaps, convg, cost.type = Y$cost.type[1], use.old.fossils = Y$use.old.fossils[1])
    
    return(df2)
    } 
  })
  
  # drop skipped leks before binding
  out <- out[!sapply(out, is.null)]
  
  res <- do.call(rbind, out)
  
  return(res)
})

res <- do.call(rbind, out)

# NOTE(review): "aligment" typo is kept in the output path on purpose —
# downstream code may depend on the existing file name
write.csv(res, "./output/aligment_gaps_and_convergence_results.csv")

Add new sequences to mafft alignments

#add new sequences only for optimal so far

alg <- list.files(path = "./data/processed/mafft_alignments", pattern = "alignment.fa$", full.names = TRUE)

# merge conflict resolved to HEAD: keep only the "early" mafft alignments
alg <- grep("early", alg, value = TRUE)

# exclude the prank alignments (handled in their own section)
alg <- grep("prank", alg, value = TRUE, invert = TRUE)

# fastas with the remaining (non-earliest) song type years
rest.seqs <- list.files(path = "./data/processed/song_sequences", pattern = "rest", full.names = TRUE)

# remove the prank ones
rest.seqs <- grep("prank", rest.seqs, value = TRUE, invert = TRUE)


out <- lapply(sel_leks, function(x){
  
  # get alignments for x lek
  lk.alg <- grep(x, alg, value = TRUE)

  for(w  in lk.alg){
    cost_mat <- ifelse(grepl("optimal", w), "optimal", "all.equal")
    use_old_fossils <- ifelse(grepl("old", w), "old", "new")
  
    # add the remaining sequences to the early alignment without changing its
    # length; HEAD side: match the alignment's own old/new label when picking
    # the rest-sequence fasta
    cll <- paste0("mafft --anysymbol --keeplength --add ", grep(paste0(use_old_fossils, "_", x), rest.seqs, value = TRUE)," --reorder ", file.path("./data/processed/mafft_alignments", paste(x, cost_mat, "early", use_old_fossils ,"alignment.fa", sep = "_")), " > ",     file.path("./data/processed/mafft_alignments", paste(x, cost_mat, "all", use_old_fossils ,"alignment.fa", sep = "_")))

    system(cll)
    }
  })

Create fastas for prank

# create fastas for each lek

###----Alignment of phenotypic sequence data: CSV to fasta format----------------------

dat <- read.csv("./data/raw/segments_by_song_type.csv", stringsAsFactors = FALSE)
yrs <- read.csv("./data/raw/year_range_by_song_type.csv", stringsAsFactors = FALSE)

# Are id's unique?
length(dat$song.type) == length(unique(dat$song.type)) #OK

# seq as a single column (columns 3:22 hold the 20 song segments)
dat$seqs <- sapply(1:nrow(dat), function(i)  paste(dat[i, 3:22], collapse = "")
)

# replace song segments with bases so prank can treat the data as DNA
dat$seqs <- gsub(pattern = "f", replacement = "C", x = dat$seqs) # trill to pyrimidine
dat$seqs <- gsub(pattern = "s", replacement = "T", x = dat$seqs) # trill to pyrimidine
dat$seqs <- gsub(pattern = "m", replacement = "Y", x = dat$seqs) # medium trill treated as ambiguous between fast and slow
dat$seqs <- gsub(pattern = "u", replacement = "A", x = dat$seqs) # tone to purine
dat$seqs <- gsub(pattern = "d", replacement = "G", x = dat$seqs) # tone to purine
dat$seqs <- gsub(pattern = "p", replacement = "R", x = dat$seqs) # tone treated as ambiguous purine


# one row for each song type-year combination
dat.yr <- yrs

dat.yr$song.type.year <- paste(dat.yr$song.type, dat.yr$year, sep = "-")

# merge conflict resolved to HEAD: keep only the leks selected for analysis
dat.yr <- dat.yr[dat.yr$lek %in% sel_leks, ]


dat.yr <- merge(dat.yr, dat[, c("song.type", "seqs")], by = "song.type")


# sort so the earliest year of each song type comes first
dat.all.old <- dat.yr[order(dat.yr$song.type, dat.yr$year), ]

# only 1 year (the earliest) per song type
dat.early.old <- dat.all.old[!duplicated(dat.all.old$song.type), ]

# same subsets excluding years before 2000
dat.all.new <- dat.all.old[dat.all.old$year > 2000, ]

dat.early.new <- dat.early.old[dat.early.old$year > 2000, ]

# rest of song type years (without earliest)
dat.rest.old <- dat.all.old[!dat.all.old$song.type.year %in% dat.early.old$song.type.year,]

dat.rest.new <- dat.all.new[!dat.all.new$song.type.year %in% dat.early.new$song.type.year,]


# by lek all song types; merge conflict resolved to HEAD: "old" files are
# written only for leks with data before 2000
for(i in sel_leks){
  
  # all old
  if (any(dat.yr$year[dat.yr$lek == i] < 2000))
    write.fasta(sequences = as.list(dat.rest.old$seqs[dat.rest.old$lek == i]), names = dat.rest.old$song.type.year[dat.rest.old$lek == i], file.out = paste0("./data/processed/song_sequences/song_seqs_rest_old_", i, "_prank.fa"), open = "w", nbchar = 60, as.string = FALSE)

  # early old
  if (any(dat.yr$year[dat.yr$lek == i] < 2000))
    write.fasta(sequences = as.list(dat.early.old$seqs[dat.early.old$lek == i]), names = dat.early.old$song.type.year[dat.early.old$lek == i], file.out = paste0("./data/processed/song_sequences/song_seqs_early_old_", i, "_prank.fa"), open = "w", nbchar = 60, as.string = FALSE)

  # all new
  write.fasta(sequences = as.list(dat.rest.new$seqs[dat.rest.new$lek == i]), names = dat.rest.new$song.type.year[dat.rest.new$lek == i], file.out = paste0("./data/processed/song_sequences/song_seqs_rest_new_", i, "_prank.fa"), open = "w", nbchar = 60, as.string = FALSE)

  # early new
  write.fasta(sequences = as.list(dat.early.new$seqs[dat.early.new$lek == i]), names = dat.early.new$song.type.year[dat.early.new$lek == i], file.out = paste0("./data/processed/song_sequences/song_seqs_early_new_", i, "_prank.fa"), open = "w", nbchar = 60, as.string = FALSE)
  }

Aligning with prank

# all prank-coded fasta files
prank_seqs <- list.files(path = "./data/processed/song_sequences", pattern = "prank")

# get only early ones
prank_seqs_early <- grep("early", prank_seqs, value = TRUE) 

# get rest
prank_seqs_rest <- grep("rest", prank_seqs, value = TRUE) 

# align every early fasta with prank; output goes next to the mafft alignments
out <- pblapply(prank_seqs_early, function(x) {

  # file names look like song_seqs_early_old_LEK_prank.fa:
  # field 3 = early, field 4 = old/new, field 5 = lek
  lek <- sapply(strsplit(x, "_", fixed = TRUE), "[[", 5)
  all.fossils <- sapply(strsplit(x, "_", fixed = TRUE), "[[", 4)
  old.fossils <- sapply(strsplit(x, "_", fixed = TRUE), "[[", 3)
  
  cll <- paste0("prank -d=./data/processed/song_sequences/", x, " -o=./data/processed/mafft_alignments/", lek, "_prank_", old.fossils, "_", all.fossils,"_alignment.fa -iterate=100 -kappa=2")

  system(cll)
  })



# remove best.fas suffix that prank appends to its output files
prank_algn <- list.files(path = "./data/processed/mafft_alignments", pattern = "best.fas", full.name = TRUE)

file.rename(from = prank_algn, to = gsub(".best.fas$", "", prank_algn))

Add new sequences to prank alignments

#add new sequences only for optimal so far

alg <- list.files(path = "./data/processed/mafft_alignments", pattern = "prank", full.names = TRUE)

# fastas with the remaining (non-earliest) song type years
rest.seqs <- list.files(path = "./data/processed/song_sequences", pattern = "rest", full.names = TRUE)

# keep only the prank-coded ones
rest.seqs <- grep("prank", rest.seqs, value = TRUE)

out <- lapply(leks, function(x){
  
  # get alignments for x lek
  lk.alg <- grep(x, alg, value = TRUE)

  for(w  in lk.alg){
    cost_mat <- "prank"
    use_old_fossils <- ifelse(grepl("old", w), "old", "new")
  
    # merge conflict resolved to HEAD: match the alignment's own old/new
    # label when picking the rest-sequence fasta (not always "new")
    cll <- paste0("mafft --anysymbol --keeplength --add ", grep(paste0(use_old_fossils, "_", x), rest.seqs, value = TRUE)," --reorder ", file.path("./data/processed/mafft_alignments", paste(x, cost_mat, "early", use_old_fossils ,"alignment.fa", sep = "_")), " > ",     file.path("./data/processed/mafft_alignments", paste(x, cost_mat, "all", use_old_fossils ,"alignment.fa", sep = "_")))

    system(cll)
    }
  })

Plot alignments

Mafft optimal costs:

  • diff type = 0.5 (i.e. trill vs pure tone)
  • same category = 1.25 (i.e. slow trill vs fast trill)
  • same segment = 2 (i.e. slow trill vs slow trill)

Mafft agnostic (all equal) costs:

  • diff type = 1.1 (i.e. trill vs pure tone)
  • same category = 1.1 (i.e. slow trill vs fast trill)
  • same segment = 1 (i.e. slow trill vs slow trill)

Prank: probabilistic multiple alignment program based on a novel algorithm that treats insertions correctly and avoids over-estimation of the number of deletion events

Make plots

#plot alignments
seqs <- list.files(path = "./data/processed/mafft_alignments", pattern = "\\.fa$", full.names = TRUE)

# keep only the "early" alignments (drops those with repeated song types)
seqs <- grep("early", seqs, value = TRUE)

# one ggmsa panel per alignment, titled with the cleaned file name
algn_plots <- lapply(seqs, function(i){
  ggmsa(i, color = "Chemistry_AA") + ggtitle(gsub("_|\\.fa", " ", basename(i)))
  })

names(algn_plots) <- basename(seqs)

algn_plots
## $BR1_all.equal_early_new_alignment.fa

## 
## $BR1_optimal_early_new_alignment.fa

## 
## $BR1_prank_early_new_alignment.fa

## 
## $CCE_all.equal_early_new_alignment.fa

## 
## $CCE_all.equal_early_old_alignment.fa

## 
## $CCE_optimal_early_new_alignment.fa

## 
## $CCE_optimal_early_old_alignment.fa

## 
## $CCE_prank_early_new_alignment.fa

## 
## $CCE_prank_early_old_alignment.fa

## 
## $HC1_all.equal_early_new_alignment.fa

## 
## $HC1_all.equal_early_old_alignment.fa

## 
## $HC1_optimal_early_new_alignment.fa

## 
## $HC1_optimal_early_old_alignment.fa

## 
## $HC1_prank_early_new_alignment.fa

## 
## $HC1_prank_early_old_alignment.fa

## 
## $SUR_all.equal_early_new_alignment.fa

## 
## $SUR_all.equal_early_old_alignment.fa

## 
## $SUR_optimal_early_new_alignment.fa

## 
## $SUR_optimal_early_old_alignment.fa

## 
## $SUR_prank_early_new_alignment.fa

## 
## $SUR_prank_early_old_alignment.fa

## 
## $TR1_all.equal_early_new_alignment.fa

## 
## $TR1_optimal_early_new_alignment.fa

## 
## $TR1_prank_early_new_alignment.fa


Save nexus files

### Convert fasta alignment to nexus
# read all alignments
all.algs <- list.files(pattern = ".fa$", path = "./data/processed/mafft_alignments/")

# loop over alignments, converting each fasta with the perl helper script
for (j in all.algs){

  # merge conflict resolved to HEAD: the lek / cost_mat / use_old_fossils
  # variables are no longer used here, so that code stays removed

  # paste together call
  cll <- paste0(" perl ./source/convertfasta2nex.pl ./data/processed/mafft_alignments/", j, " > ",  "./data/processed/nexus/", gsub("\\.fa$", ".nex", j))

 # call perl
 system(cll)  
}

# fix nexus header 
Nex.file <- list.files(pattern = ".nex$", path = "./data/processed/nexus/", full.names = TRUE)

# replace the base alphabet in the header and, for prank files, translate the
# sequences back from bases to song segments
for(i in Nex.file){  
   align <- readLines(i)
    # declare the six song-segment symbols instead of the DNA default
    align[4] <- 'format datatype=standard gap=- missing=? symbols="dfmpsu";'
    
    # change seqs back to sound segments (only prank files were base-coded)
    if (grepl("prank", i)){
      # sequence lines sit between the header (lines 1-6) and the trailer
      seqs <- align[7:(length(align)-2)]   
      sep <- separate(data.frame(seqs = seqs), col = "seqs", sep = "\t", into = c("song", "seqs")) 
      
      # replace bases with song segments (inverse of the earlier recoding)
      sep$seqs <- gsub(pattern = "C", replacement = "f", x = sep$seqs) # pyrimidine back to trill
      sep$seqs <- gsub(pattern = "T", replacement = "s", x = sep$seqs) # pyrimidine back to trill
      sep$seqs <- gsub(pattern = "Y", replacement = "m", x = sep$seqs) # ambiguity code back to medium trill
      sep$seqs <- gsub(pattern = "A", replacement = "u", x = sep$seqs) # purine back to tone
      sep$seqs <- gsub(pattern = "G", replacement = "d", x = sep$seqs) # purine back to tone
      sep$seqs <- gsub(pattern = "R", replacement = "p", x = sep$seqs)
    
      align[7:(length(align)-2)] <- paste0(sep$song, "\t", sep$seqs)       
    }
    write(align, i)
}

move latest try of the models to a different folder

# current models 
current.output <- list.files(pattern = "posterior.var",
                             path = "./output/revbayes",
                             recursive = TRUE)

# model name = first 5 "_"-separated fields of the file name
# (fixed: stray '"[[", 1' arguments were being passed INTO strsplit())
done <- unique(sapply(current.output, function(x)
  paste(strsplit(x, "_", fixed = TRUE)[[1]][1:5], collapse = "_"), USE.NAMES = FALSE))


# create expected models
#selected leks
sel_leks <- c("SUR", "CCE", "HC1", "BR1", "TR1")

dat <-
  read.csv("./data/raw/segments_by_song_type.csv", stringsAsFactors = FALSE)
yrs <-
  read.csv("./data/raw/year_range_by_song_type.csv", stringsAsFactors = FALSE)

yrs$song.type.year <- paste(yrs$song.type, yrs$year, sep = "-")
alignment   <- c("optimal", "all.equal", "prank") # all.equal = equal rates

# use all fosils (e.g. SURA-2011, SURA-2012, )
use_all_fossils <- c("all", "early")

# include also fossils before 2008
use_old_fossils <-  c("old", "new")

branch_rate_model <- c("global", "Uexp")

# make all possible combinations
grd <-
  expand.grid(
    lek = sel_leks,
    alignment = alignment,
    use_old_fossils = use_old_fossils,
    use_all_fossils = use_all_fossils,
    branch_rate_model = branch_rate_model
  )

grd <- grd[order(grd$lek),]

# remove "old" combinations without data before 2000s
for (i in sel_leks)
  if (!any(yrs$year[yrs$lek == i] < 2000))
    grd <- grd[!(grd$use_old_fossils == "old" & grd$lek == i),]

grd$output <- apply(grd, 1, paste, collapse = "_")

# missing models
# NOTE(review): mod_mat is not defined anywhere in this file — confirm the
# intended object (post_df$model, built below, looks like the candidate)
setdiff(grd$output, mod_mat$model)


posteriors <- list.files(pattern = "posterior.var",
                             path = "./output/revbayes",
                             full.names = TRUE)

# same strsplit fix as above: extract the first 5 fields of the base name
post_df <- data.frame(full_name = posteriors, model = sapply(basename(posteriors), function(x)
  paste(strsplit(x, "_", fixed = TRUE)[[1]][1:5], collapse = "_"), USE.NAMES = FALSE))

tb <- table(post_df$model)

posteriors.info <- file.info(post_df$full_name)

 post_df$size <- posteriors.info$size

 post_df$mtime <- posteriors.info$mtime


# keep only the most recent run of each model
out <- lapply(unique(post_df$model), FUN = function(x){
  
  Y <- post_df[post_df$model == x, ]
  
  W <- Y[which.max(Y$mtime), , drop = FALSE]
  
  return(W)
  
})

last_models <- do.call(rbind, out)

# model run id = 6th "_"-separated field of the file name
last_models$model_id <- sapply(last_models$full_name, function(x)
  strsplit(x, "_", fixed = TRUE)[[1]][6], USE.NAMES = FALSE)


files_to_copy <- list.files(pattern = paste(last_models$model_id, collapse = "|"),
                             path = "./output/revbayes",
                             recursive = TRUE)

ids <- sapply(files_to_copy, function(x)
  strsplit(x, "_", fixed = TRUE)[[1]][6], USE.NAMES = FALSE)

ids <- gsub(".trees|.log", "", ids)

# sanity check: 9 files per model, 93 models expected
length(ids) / 9 == 93


files_to_copy <- list.files(pattern = paste(last_models$model_id, collapse = "|"),
                             path = "./output/revbayes",
                             full.names = TRUE)

copy_res <- file.copy(from = files_to_copy, to = file.path("./output/most_recent_revbayes_models", basename(files_to_copy)))

all(copy_res)

R session information

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=pt_PT.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_CR.UTF-8        LC_COLLATE=pt_PT.UTF-8    
##  [5] LC_MONETARY=es_CR.UTF-8    LC_MESSAGES=pt_PT.UTF-8   
##  [7] LC_PAPER=es_CR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidyr_1.1.2     cowplot_1.1.0   ggmsa_0.0.4     seqmagick_0.1.4
##  [5] gtools_3.8.2    ips_0.0.11      seqinr_4.2-4    ape_5.4-1      
##  [9] ggplot2_3.3.2   pbapply_1.4-3  
## 
## loaded via a namespace (and not attached):
##  [1] phangorn_2.5.5      progress_1.2.2      tidyselect_1.1.0   
##  [4] xfun_0.18           purrr_0.3.4         lattice_0.20-41    
##  [7] colorspace_1.4-1    vctrs_0.3.4         generics_0.1.0     
## [10] viridisLite_0.3.0   htmltools_0.5.0     stats4_4.0.3       
## [13] yaml_2.2.1          XML_3.99-0.5        rlang_0.4.8        
## [16] pillar_1.4.6        glue_1.4.2          withr_2.3.0        
## [19] BiocGenerics_0.36.0 lifecycle_0.2.0     plyr_1.8.6         
## [22] zlibbioc_1.36.0     stringr_1.4.0       Biostrings_2.58.0  
## [25] munsell_0.5.0       gtable_0.3.0        evaluate_0.14      
## [28] labeling_0.4.2      knitr_1.30          IRanges_2.24.0     
## [31] Rcpp_1.0.5          scales_1.1.1        S4Vectors_0.28.0   
## [34] XVector_0.30.0      farver_2.0.3        gridExtra_2.3      
## [37] fastmatch_1.1-0     hms_0.5.3           digest_0.6.27      
## [40] stringi_1.5.3       dplyr_1.0.2         grid_4.0.3         
## [43] ade4_1.7-16         quadprog_1.5-8      tools_4.0.3        
## [46] magrittr_1.5        tibble_3.0.4        crayon_1.3.4       
## [49] pkgconfig_2.0.3     ellipsis_0.3.1      MASS_7.3-53        
## [52] Matrix_1.2-18       prettyunits_1.1.1   viridis_0.5.1      
## [55] rmarkdown_2.5       R6_2.4.1            igraph_1.2.6       
## [58] nlme_3.1-149        compiler_4.0.3